# NOTE(review): clearing the workspace at the top of a script is discouraged
# (it is not a fresh R session); kept here to match the report's workflow.
rm(list = ls())

library(naniar)
library(ggplot2)
library(reshape2)
library(magrittr)
library(dplyr)
library(car)
library(GGally)
library(viridis)
library(caret)
library(cowplot)
library(FNN)
library(MASS)
library(rpart)
library(rpart.plot)
library(adabag)
library(randomForest)
library(imbalance)
library(e1071)
library(neuralnet)
library(nnet)
# (duplicate library(ggplot2) call removed; it was already attached above)

# setwd("~/GitHub/CVTDM_Project")
# Load the UCI white-wine quality data (semicolon-separated, with header).
wine <- read.csv(file = "winequality-white.csv", header = TRUE, sep = ";")

Data Exploratory Analysis

First Insights

First, let’s have a look at the variables and their types:

# Inspect the structure: 4898 rows, 11 numeric predictors + integer quality.
str(wine)
## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...

Now, let’s create a binned_quality variable for the purpose of the problem:

# Count distinct values per column; quality takes only 7 distinct values,
# making it a natural candidate for binning. vapply() replaces sapply()
# for a type-stable integer result.
vapply(wine, function(x) length(unique(x)), integer(1))
##        fixed.acidity     volatile.acidity          citric.acid 
##                   68                  125                   87 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##                  310                  160                  132 
## total.sulfur.dioxide              density                   pH 
##                  251                  890                  103 
##            sulphates              alcohol              quality 
##                   79                  103                    7
# Bin the 3-9 quality scores into three classes:
#   < 5 -> "Low", 5-6 -> "Intermediate", >= 7 -> "High".
# The previous "None" fallback was unreachable (the last branch only sees
# quality >= 7), so it has been removed; behavior is unchanged.
wine$binned_quality <- as.factor(ifelse(wine$quality < 5, "Low",
                                 ifelse(wine$quality < 7, "Intermediate",
                                                          "High")))

# Treat the raw quality score as categorical from here on.
wine$quality <- as.factor(wine$quality)

We can also have a look at the summary statistics:

# Five-number summaries of the predictors plus class counts of the outcomes.
summary(wine)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.300   1st Qu.:0.2100   1st Qu.:0.2700   1st Qu.: 1.700  
##  Median : 6.800   Median :0.2600   Median :0.3200   Median : 5.200  
##  Mean   : 6.855   Mean   :0.2782   Mean   :0.3342   Mean   : 6.391  
##  3rd Qu.: 7.300   3rd Qu.:0.3200   3rd Qu.:0.3900   3rd Qu.: 9.900  
##  Max.   :14.200   Max.   :1.1000   Max.   :1.6600   Max.   :65.800  
##                                                                     
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  2.00      Min.   :  9.0        Min.   :0.9871  
##  1st Qu.:0.03600   1st Qu.: 23.00      1st Qu.:108.0        1st Qu.:0.9917  
##  Median :0.04300   Median : 34.00      Median :134.0        Median :0.9937  
##  Mean   :0.04577   Mean   : 35.31      Mean   :138.4        Mean   :0.9940  
##  3rd Qu.:0.05000   3rd Qu.: 46.00      3rd Qu.:167.0        3rd Qu.:0.9961  
##  Max.   :0.34600   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##                                                                             
##        pH          sulphates         alcohol      quality       binned_quality
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   3:  20   High        :1060  
##  1st Qu.:3.090   1st Qu.:0.4100   1st Qu.: 9.50   4: 163   Intermediate:3655  
##  Median :3.180   Median :0.4700   Median :10.40   5:1457   Low         : 183  
##  Mean   :3.188   Mean   :0.4898   Mean   :10.51   6:2198                      
##  3rd Qu.:3.280   3rd Qu.:0.5500   3rd Qu.:11.40   7: 880                      
##  Max.   :3.820   Max.   :1.0800   Max.   :14.20   8: 175                      
##                                                   9:   5
# Standard deviation of each numeric predictor (columns 12/13 are the factor
# outcomes quality and binned_quality). vapply() replaces sapply() for a
# type-stable numeric result.
vapply(wine[, -c(12, 13)], sd, numeric(1))
##        fixed.acidity     volatile.acidity          citric.acid 
##          0.843868228          0.100794548          0.121019804 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##          5.072057784          0.021847968         17.007137325 
## total.sulfur.dioxide              density                   pH 
##         42.498064554          0.002990907          0.151000600 
##            sulphates              alcohol 
##          0.114125834          1.230620568
# Re-check types: quality and binned_quality are now factors.
str(wine)
## 'data.frame':    4898 obs. of  13 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : Factor w/ 7 levels "3","4","5","6",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ binned_quality      : Factor w/ 3 levels "High","Intermediate",..: 2 2 2 2 2 2 2 2 2 2 ...

Visualisation & Distributions

Let’s look at the distribution of the variables based on the quality variable (not binned, on a 0-10 scale):

# Distribution of every predictor, split by the (unbinned) quality score.
# Melt to long format, then facet one boxplot panel per variable.
boxplots <- melt(wine[, -13], id.vars = "quality") %>%
  ggplot(aes(x = quality, y = value, group = quality)) +
  geom_boxplot(fill = "transparent", color = "black") +
  facet_wrap(~variable, scale = "free", ncol = 3) +
  theme_classic()

boxplots

First, one can see that there is not a lot of variation between wines of different quality. There is a slight increase in alcohol content for wines of better quality, as well as a slight increase in pH levels.

Let’s continue by exploring the distribution of each variable:

# Histogram of each numeric predictor on a 2x3 grid, with the mean (black)
# and median (red) as vertical reference lines.
par(mfrow = c(2, 3))
for (i in seq_len(11)) {
  hist(wine[, i], main = names(wine[i]), xlab = names(wine[i]))
  abline(v = mean(wine[, i]), col = 1, lwd = 2)
  abline(v = median(wine[, i]), col = 2, lwd = 2)
}

par(mfrow = c(1, 1))

# Class counts of the binned outcome. The original used
# names(wine$binned_quality) for the labels, which is NULL for an unnamed
# vector, so the plot silently had no title or x-label; label it explicitly.
barplot(table(wine$binned_quality), main = "binned_quality", xlab = "binned_quality")

Our dataset faces two problems we have to deal with: imbalance between quality groups (‘Intermediate’ quality is over-represented) and skewness in the distribution of the explanatory variables.

To correct for the skewness, we can log-transform the variables of interest:

# Log-transform every predictor; columns 12 (quality) and 13 (binned_quality)
# are categorical and are left untouched.
alllogwine <- wine
alllogwine[, -c(12, 13)] <- lapply(alllogwine[, -c(12, 13)], log)

# Same per-quality boxplots as before, now on the log scale.
boxplots <- melt(alllogwine[, -13], id.vars = "quality") %>%
  ggplot(aes(x = quality, y = value, group = quality)) +
  geom_boxplot(fill = "transparent", color = "black") +
  facet_wrap(~variable, scale = "free", ncol = 3) +
  theme_classic()

boxplots

Let’s have a look at the distribution of the variables after log-transformation:

# Histograms of the log-transformed predictors, mean (black) / median (red)
# reference lines, mirroring the earlier untransformed panel.
par(mfrow = c(2, 3))
for (i in seq_len(11)) {
  hist(alllogwine[, i], main = names(alllogwine[i]), xlab = names(alllogwine[i]))
  abline(v = mean(alllogwine[, i]), col = 1, lwd = 2)
  abline(v = median(alllogwine[, i]), col = 2, lwd = 2)
}

Clearly, we see an improvement in the distribution of the variables, with less variability due to extreme observations. There are still some variables for which the transformation does not add much: citric.acid, total.sulfur.dioxide and pH. Indeed, these variables were already pretty well distributed. Also, note that density is still right-skewed.

Correlation Between Explanatory Variables

Now, let’s have a look at the correlation between the explanatory variables:

# Pairwise correlations of the numeric predictors, rounded to 2 decimals and
# rendered as an annotated heatmap.
cor_mat <- round(cor(wine[, -c(12, 13)]), 2)
cor_long <- melt(cor_mat)

ggplot(cor_long, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  geom_text(aes(Var2, Var1, label = value), color = "white", size = 3) +
  labs(title = "Heatmap of the correlation table") +
  theme(axis.text.x = element_text(angle = 90))

We notice a high negative correlation coefficient between alcohol and density (-0.78). Because it is generally better to reduce the number of dimensions and because a high correlation might lead to multicollinearity issues, one can decide to drop the density variable.

Let’s look at the VIFs (Variance Inflation Factors) by performing a linear regression:

# Temporarily treat quality as numeric so we can fit a linear model and
# inspect multicollinearity via VIFs. as.numeric() on a factor returns the
# level codes (1-7), not the true scores (3-9); convert through
# as.character() first. This is a constant shift, so the fitted VIFs are
# identical either way, but the later re-factoring now restores the correct
# "3".."9" labels instead of "1".."7".
wine$quality <- as.numeric(as.character(wine$quality))

model <- lm(quality ~ ., data = wine[, -13])
vif(model)
##        fixed.acidity     volatile.acidity          citric.acid 
##             2.691435             1.141156             1.165215 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##            12.644064             1.236822             1.787880 
## total.sulfur.dioxide              density                   pH 
##             2.239233            28.232546             2.196362 
##            sulphates              alcohol 
##             1.138540             7.706957

It appears that the density variable has a very high VIF (28.2) compared to other variables. Hence, let’s have a look at the model without density and see if there is an improvement:

# Refit without density (column 8) to confirm the remaining VIFs drop.
model2 = lm(quality ~., data = wine[,-c(8,13)])
vif(model2)
##        fixed.acidity     volatile.acidity          citric.acid 
##             1.356128             1.128298             1.159884 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##             1.435215             1.203645             1.744627 
## total.sulfur.dioxide                   pH            sulphates 
##             2.153170             1.330912             1.056637 
##              alcohol 
##             1.647117
# Restore quality to a factor now that the VIF check is done.
wine$quality = as.factor(wine$quality)

Clearly, the VIF of alcohol is lower compared to before (7.7 vs. 1.6). Thus, we will drop density from our models.

Finally, let’s look at this plot:

# Parallel-coordinates plot of the 11 predictors, colored by binned_quality
# (column 13); each axis is rescaled to [0, 1] ("uniminmax").
ggparcoord(wine, columns = 1:11, groupColumn = 13, showPoints = TRUE, alphaLines = 0.3, scale = "uniminmax") + scale_color_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90))

Data pre-processing

Variable selection and log-transformation

First, let’s drop density and quality:

# Drop density (column 8, highly collinear with alcohol) and the raw quality
# score (column 12); binned_quality remains as the outcome.
logwine = wine[,-c(8,12)]

Now, let’s log transform all variables except citric.acid, total.sulfur.dioxide, pH and binned_quality:

# Log-transform all predictors except citric.acid (3), total.sulfur.dioxide
# (7) and pH (8), which were already well distributed; column 11 is the
# factor outcome binned_quality.
logwine[,-c(3,7,8,11)] = lapply(logwine[,-c(3,7,8,11)], log) # log transform all variables except citric.acid, total.sulfur.dioxide, pH and binned_quality
head(logwine) 
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1      1.945910        -1.309333        0.36      3.0301337 -3.101093
## 2      1.840550        -1.203973        0.34      0.4700036 -3.015935
## 3      2.091864        -1.272966        0.40      1.9315214 -2.995732
## 4      1.974081        -1.469676        0.32      2.1400662 -2.847312
## 5      1.974081        -1.469676        0.32      2.1400662 -2.847312
## 6      2.091864        -1.272966        0.40      1.9315214 -2.995732
##   free.sulfur.dioxide total.sulfur.dioxide   pH  sulphates  alcohol
## 1            3.806662                  170 3.00 -0.7985077 2.174752
## 2            2.639057                  132 3.30 -0.7133499 2.251292
## 3            3.401197                   97 3.26 -0.8209806 2.312535
## 4            3.850148                  186 3.19 -0.9162907 2.292535
## 5            3.850148                  186 3.19 -0.9162907 2.292535
## 6            3.401197                   97 3.26 -0.8209806 2.312535
##   binned_quality
## 1   Intermediate
## 2   Intermediate
## 3   Intermediate
## 4   Intermediate
## 5   Intermediate
## 6   Intermediate

Data partitioning

We can now proceed to the partitioning of the data, with a training set (50%), validation set (30%) and a test set (20%):

# 50/30/20 stratified split: 50% of rows go to training, then 60% of the
# remainder (30% overall) to validation, and the rest (20%) to the test set.
# createDataPartition() stratifies on binned_quality so class proportions
# are preserved in each split.
set.seed(1)
train_index = createDataPartition(logwine$binned_quality, p = .5, list = FALSE)
train_df = logwine[train_index,]
valid_test_df = logwine[-train_index,]
valid_index = createDataPartition(valid_test_df$binned_quality, p = .6, list = FALSE)
valid_df = valid_test_df[valid_index,]
test_df = valid_test_df[-valid_index,]

Data normalization

Because we use some classification techniques that require the data to be on a same scale (kNN, Neural Nets and Logistic Regression), we normalize the data on a [0,1] scale:

# Initialize the normalized training/validation/test data frames from the
# original splits; only the numeric columns will be overwritten below.
train_norm_df = train_df
valid_norm_df = valid_df
test_norm_df = test_df

# Use preProcess() from caret (method = "range" rescales to [0, 1]) fitted on
# the training predictors only, then apply the same ranges to all three sets
# via predict(); column 11 is the factor outcome and is excluded.
norm_values = preProcess(train_df[,-c(11)], method = "range")
train_norm_df[,-c(11)] = predict(norm_values, train_df[,-c(11)])
valid_norm_df[,-c(11)] = predict(norm_values, valid_df[,-c(11)])
test_norm_df[,-c(11)] = predict(norm_values, test_df[,-c(11)])

Dummyfication of the outcome variable

Because we perform multi-label classification, and because the neuralnet package requires the outcome variable to be coded as a (0, 1) vector, we create 3 dummies, one for every quality level (‘High’, ‘Intermediate’ and ‘Low’). Note that we also apply this to the validation and test sets:

# Append one 0/1 dummy column per class ("High", "Intermediate", "Low"), as
# the neuralnet package needs the outcome coded as indicator vectors.
# The same transformation applied to three data frames was triplicated
# inline; factor it into one helper so the encoding cannot drift.
add_quality_dummies <- function(df) {
  # df: a data frame with a binned_quality factor column; returns df with
  # high_quality / intermediate_quality / low_quality indicator columns added.
  df$high_quality <- ifelse(df$binned_quality == "High", 1, 0)
  df$intermediate_quality <- ifelse(df$binned_quality == "Intermediate", 1, 0)
  df$low_quality <- ifelse(df$binned_quality == "Low", 1, 0)
  df
}

# Dummy-augmented versions of the normalized train/validation/test sets.
train_norm_dummy_df <- add_quality_dummies(train_norm_df)
valid_norm_dummy_df <- add_quality_dummies(valid_norm_df)
test_norm_dummy_df <- add_quality_dummies(test_norm_df)

Oversampling

As stated earlier in the Data Exploratory Analysis, we face an imbalanced dataset:

# Absolute and relative class frequencies: "Intermediate" dominates (~75%),
# while "Low" is only ~3.7% of observations.
table(logwine$binned_quality)
## 
##         High Intermediate          Low 
##         1060         3655          183
prop.table(table(logwine$binned_quality))
## 
##         High Intermediate          Low 
##   0.21641486   0.74622295   0.03736219

We see that ‘Intermediate’ quality represents 74.6% of observations, whereas ‘High’ and ‘Low’ quality only represent 21.6% and 3.7% of observations (respectively). Thus, the classification models could have problems in predicting ‘High’, and even more importantly ‘Low’ observations, as they only represent 3.7% of observations.

To solve this problem, we can oversample the ‘Low’ observations and add them into the training set. We use Random Walk Oversampling, as this method is known to be one of the most effective, especially when facing such an imbalanced dataset (Huaxiang Zhang, Mingfang Li, 2014).

Now, let’s create 1736 new instances for ‘Low’ and 1298 instances for ‘High’ quality (such that we have the same number of observations in each category within the training set) and let’s add them to the training set:

# Random Walk Oversampling (imbalance::rwo) synthesizes minority-class rows.
# Two passes: first balance "Low", then (on the augmented set) "High", so the
# training classes end up with equal counts.
set.seed(1)
add_low_train_df = rwo(train_df, 1736, "binned_quality") # Generation of 1736 instances for 'Low'
os_train_df = rbind(train_df, add_low_train_df) # Combining the new instances to the training set
set.seed(1)
add_high_train_df = rwo(os_train_df, 1298, "binned_quality") # Generation of 1298 instances for 'High'
os_train_df = rbind(os_train_df, add_high_train_df)

Let’s normalize the newly generated observations, and add them to the normalize training set:

# Normalized copy of the oversampled training set.
os_train_norm_df = os_train_df

# Use preProcess() from the caret package and predict() to rescale the
# numeric columns to [0, 1] (column 11 is the factor outcome).
# NOTE(review): the ranges are re-learned on the oversampled training set,
# while valid_norm_df/test_norm_df were scaled with norm_values fitted on the
# original training set — the two scalings can differ slightly, which matters
# for distance-based models like kNN; confirm this is intended.
os_norm_values = preProcess(os_train_df[,-c(11)], method = "range")
os_train_norm_df[,-c(11)] = predict(os_norm_values, os_train_df[,-c(11)])

We can also add these newly generated observations to the normalized training set with dummies for quality (used for Neural Nets):

# Initialize the oversampled normalized training set with dummies from the
# oversampled normalized one, then add one 0/1 indicator per class
# (same encoding as the earlier dummy-coded train/validation/test sets).
os_train_norm_dummy_df <- os_train_norm_df

os_train_norm_dummy_df$high_quality <- ifelse(os_train_norm_dummy_df$binned_quality == "High", 1, 0)
os_train_norm_dummy_df$intermediate_quality <- ifelse(os_train_norm_dummy_df$binned_quality == "Intermediate", 1, 0)
os_train_norm_dummy_df$low_quality <- ifelse(os_train_norm_dummy_df$binned_quality == "Low", 1, 0)

Analysis

In this part, we will proceed to the analysis using the following classification techniques: kNN, Multinomial Logistic Regression, Naive Bayes, Classification Trees (fitted to CP, Boosted Tree, Bagged Tree and Random Forest) and Neural Nets. Finally, we will combine the results to produce two ensemble methods based on Majority Voting and Mean Probability rules.

Note that we will use the training sets with oversampling on ‘Low’ quality observations for all methods.

kNN

Let’s proceed to the kNN:

# initialize a new data frame with three columns: k, kappa, and balanced_accuracy
best_k_df <- data.frame(k = seq(1, 50, 1), kappa = rep(0, 50), balanced_accuracy = rep(0, 50))

# Tune k on the validation set: fit kNN for each k, record Cohen's kappa and
# the balanced accuracy (mean of the three per-class sensitivities).
# The confusion matrix is now computed once per iteration — the original
# recomputed it four times per k, quadrupling the dominant cost.
for (i in seq_len(nrow(best_k_df))) {
  knn_pred <- knn(train = os_train_norm_df[,-11], test = valid_norm_df[,-11], cl = os_train_norm_df[,11], k = i)
  knn_cm <- confusionMatrix(knn_pred, valid_norm_df[,11])
  best_k_df[i, 2] <- knn_cm$overall[2]
  # byClass rows follow the factor-level order: High, Intermediate, Low.
  low_sensitivity <- knn_cm$byClass[3, 1]
  intermediate_sensitivity <- knn_cm$byClass[2, 1]
  high_sensitivity <- knn_cm$byClass[1, 1]
  best_k_df[i, 3] <- (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
}

# Kappa (red, top) and balanced accuracy (blue, bottom) versus k, stacked.
# Local typo fixed: "balanced_accuray_plot" -> "balanced_accuracy_plot"
# (the name was defined and used only in these lines).
kappa_plot <- ggplot(data = best_k_df) + geom_line(aes(x = k, y = kappa), color = "red") + theme_classic()
balanced_accuracy_plot <- ggplot(data = best_k_df) + geom_line(aes(x = k, y = balanced_accuracy), color = "blue") + theme_classic()

knn_plot <- plot_grid(kappa_plot, balanced_accuracy_plot, ncol = 1, align = "v")
knn_plot

# Best k under each criterion: kappa peaks at k = 1, balanced accuracy at k = 28.
which.max(best_k_df$kappa)#best k based on kappa
## [1] 1
which.max(best_k_df$balanced_accuracy)#best k based on balanced accuracy
## [1] 28

From the plots above, the best number of neighbours is k=1 if we want to maximize the kappa value, whereas balanced accuracy is maximized at k=28. We proceed with k=1.

Choosing this parameter(k=1), we can assess the predictive performance of the kNN model on the test set:

# Perform kNN classification on the test set using the chosen k = 1;
# knn3Train() (caret) returns class labels with prob = T so class-vote
# probabilities are attached to the result.
set.seed(1)
best_knn_pred = knn3Train(train = os_train_norm_df[,-11], test = test_norm_df[,-11], cl = os_train_norm_df[,11], k = 1, prob = T)
best_knn_cm <- confusionMatrix(as.factor(best_knn_pred), test_norm_df[,11])#create corresponding confusion matrix 
best_knn_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          141          114   2
##   Intermediate   51          519  16
##   Low            20           97  18
## 
## Overall Statistics
##                                          
##                Accuracy : 0.6933         
##                  95% CI : (0.6633, 0.722)
##     No Information Rate : 0.7464         
##     P-Value [Acc > NIR] : 0.9999         
##                                          
##                   Kappa : 0.3749         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.6651              0.7110    0.50000
## Specificity               0.8486              0.7298    0.87580
## Pos Pred Value            0.5486              0.8857    0.13333
## Neg Pred Value            0.9015              0.4617    0.97865
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1442              0.5307    0.01840
## Detection Prevalence      0.2628              0.5992    0.13804
## Balanced Accuracy         0.7568              0.7204    0.68790
# Test-set kappa for the k = 1 model.
kappa_best_knn = best_knn_cm$overall[2]
kappa_best_knn
##     Kappa 
## 0.3748935
# Per-class sensitivities; byClass rows are ordered High, Intermediate, Low.
low_sensitivity = best_knn_cm$byClass[3,1]
intermediate_sensitivity = best_knn_cm$byClass[2,1]
high_sensitivity = best_knn_cm$byClass[1,1]

# Balanced accuracy = unweighted mean of the three class sensitivities.
bal_acc_best_knn = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_best_knn
## [1] 0.6253511

Logistic regression

# Multinomial logistic regression (nnet::multinom) on the oversampled
# training set; "High" is the reference class (coefficients below are for
# Intermediate and Low relative to it).
os_log_reg = multinom(binned_quality ~ ., data = os_train_df)
## # weights:  36 (22 variable)
## initial  value 6024.789791 
## iter  10 value 4983.359463
## iter  20 value 4393.575474
## iter  30 value 4287.516936
## iter  30 value 4287.516926
## iter  30 value 4287.516926
## final  value 4287.516926 
## converged
# Coefficients and standard errors per non-reference class.
summary(os_log_reg)
## Call:
## multinom(formula = binned_quality ~ ., data = os_train_df)
## 
## Coefficients:
##              (Intercept) fixed.acidity volatile.acidity citric.acid
## Intermediate    31.86889     -0.962256         1.332421    1.997239
## Low             41.98841      1.637545         3.410483    2.390439
##              residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Intermediate     -0.2389481  1.275859          -0.4741764         0.0001637821
## Low              -0.4902087  1.084144          -2.0908903         0.0007713492
##                      pH   sulphates    alcohol
## Intermediate -0.9102455 -0.49837427  -8.518968
## Low           0.3831240  0.04088377 -13.564077
## 
## Std. Errors:
##              (Intercept) fixed.acidity volatile.acidity citric.acid
## Intermediate  0.03791899     0.2653690        0.1202564   0.3810659
## Low           0.05520153     0.3293996        0.1469022   0.4245366
##              residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Intermediate     0.04616474 0.1677127          0.07507555         0.0002866020
## Low              0.05390061 0.1941242          0.08257761         0.0002893369
##                     pH sulphates   alcohol
## Intermediate 0.1995491 0.1552705 0.3660252
## Low          0.2476004 0.1893426 0.4563050
## 
## Residual Deviance: 8575.034 
## AIC: 8619.034
# Class-membership probabilities on the test set. type = "probs" is spelled
# out instead of the original "p", which relied on partial argument-value
# matching in predict.multinom().
os_log_prob <- predict(os_log_reg, newdata = test_df, type = "probs")

# Predicted class = column with the highest probability in each row.
os_log_pred <- data.frame("pred" = colnames(os_log_prob)[apply(os_log_prob, 1, which.max)])
os_log_pred$pred <- as.factor(os_log_pred$pred)

os_log_cm <- confusionMatrix(os_log_pred$pred, test_df[, 11])
os_log_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          158          217   2
##   Intermediate   41          363   8
##   Low            13          150  26
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5593          
##                  95% CI : (0.5275, 0.5907)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2592          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.7453              0.4973    0.72222
## Specificity               0.7141              0.8024    0.82696
## Pos Pred Value            0.4191              0.8811    0.13757
## Neg Pred Value            0.9101              0.3516    0.98733
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1616              0.3712    0.02658
## Detection Prevalence      0.3855              0.4213    0.19325
## Balanced Accuracy         0.7297              0.6498    0.77459
# Test-set kappa for the multinomial logistic model.
kappa_logist = os_log_cm$overall[2]
kappa_logist
##     Kappa 
## 0.2591899
# Per-class sensitivities; byClass rows are ordered High, Intermediate, Low.
low_sensitivity = os_log_cm$byClass[3,1]
intermediate_sensitivity = os_log_cm$byClass[2,1]
high_sensitivity = os_log_cm$byClass[1,1]

# Balanced accuracy = unweighted mean of the three class sensitivities.
bal_acc_logist = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_logist
## [1] 0.6549218

Naive Bayes

# Gaussian Naive Bayes (e1071) on the oversampled training set; the a-priori
# probabilities come out uniform (1/3 each) because oversampling balanced
# the classes.
os_nb_model = naiveBayes (binned_quality ~ ., data = os_train_df)
os_nb_model
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##         High Intermediate          Low 
##    0.3333333    0.3333333    0.3333333 
## 
## Conditional probabilities:
##               fixed.acidity
## Y                  [,1]      [,2]
##   High         1.901409 0.1148070
##   Intermediate 1.918476 0.1220817
##   Low          1.949177 0.1528686
## 
##               volatile.acidity
## Y                   [,1]      [,2]
##   High         -1.390723 0.3460800
##   Intermediate -1.339210 0.3204440
##   Low          -1.061166 0.3876045
## 
##               citric.acid
## Y                   [,1]      [,2]
##   High         0.3245435 0.0778726
##   Intermediate 0.3371007 0.1261949
##   Low          0.3079582 0.1617614
## 
##               residual.sugar
## Y                  [,1]      [,2]
##   High         1.337543 0.8285368
##   Intermediate 1.549765 0.9327049
##   Low          1.106317 0.9428895
## 
##               chlorides
## Y                   [,1]      [,2]
##   High         -3.300988 0.2666303
##   Intermediate -3.108617 0.3105590
##   Low          -3.075185 0.3467195
## 
##               free.sulfur.dioxide
## Y                  [,1]      [,2]
##   High         3.461398 0.4195593
##   Intermediate 3.465685 0.5300177
##   Low          2.780996 0.7982464
## 
##               total.sulfur.dioxide
## Y                  [,1]     [,2]
##   High         126.0414  50.7710
##   Intermediate 142.7336  43.1156
##   Low          133.7434 296.3143
## 
##               pH
## Y                  [,1]      [,2]
##   High         3.211628 0.1553305
##   Intermediate 3.183485 0.1485863
##   Low          3.191302 0.1664048
## 
##               sulphates
## Y                    [,1]      [,2]
##   High         -0.7166881 0.2587124
##   Intermediate -0.7382040 0.2114988
##   Low          -0.7587270 0.2463432
## 
##               alcohol
## Y                  [,1]       [,2]
##   High         2.426630 0.11687083
##   Intermediate 2.325167 0.10633337
##   Low          2.322142 0.09880537
# Hard class predictions and raw posterior probabilities on the test set.
os_nb_pred = predict(os_nb_model, newdata = test_df, type = "class")
os_nb_prob = predict(os_nb_model, newdata = test_df, type = "raw")
bayes_cm <- confusionMatrix(os_nb_pred, test_df[,11])
bayes_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          163          231   4
##   Intermediate   46          433  13
##   Low             3           66  19
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6288          
##                  95% CI : (0.5977, 0.6592)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3036          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.7689              0.5932    0.52778
## Specificity               0.6932              0.7621    0.92675
## Pos Pred Value            0.4095              0.8801    0.21591
## Neg Pred Value            0.9155              0.3889    0.98090
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1667              0.4427    0.01943
## Detection Prevalence      0.4070              0.5031    0.08998
## Balanced Accuracy         0.7310              0.6776    0.72726
# Test-set kappa for the Naive Bayes model.
kappa_bayes = bayes_cm$overall[2]
kappa_bayes
##     Kappa 
## 0.3035937
# Per-class sensitivities; byClass rows are ordered High, Intermediate, Low.
low_sensitivity = bayes_cm$byClass[3,1]
intermediate_sensitivity = bayes_cm$byClass[2,1]
high_sensitivity = bayes_cm$byClass[1,1]

# Balanced accuracy = unweighted mean of the three class sensitivities.
bal_acc_bayes = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_bayes
## [1] 0.6299321

Classification trees

Regular Classification Tree

Let’s first create a fully grown tree before pruning it to the best CP (i.e., the CP associated with the minimum cross-validated error):

# Grow a full tree (cp = 0, minbucket = 2) with 5-fold cross-validation
# (xval = 5) so we can later prune back to the cp with the lowest
# cross-validated error.
set.seed(1)
tree1 <- rpart(binned_quality ~ ., data=os_train_df, method = "class", control = rpart.control(cp = 0, minbucket = 2, xval = 5))

# CP table: xerror is the cross-validated error used to pick the pruning cp.
printcp(tree1)
## 
## Classification tree:
## rpart(formula = binned_quality ~ ., data = os_train_df, method = "class", 
##     control = rpart.control(cp = 0, minbucket = 2, xval = 5))
## 
## Variables actually used in tree construction:
##  [1] alcohol              chlorides            citric.acid         
##  [4] fixed.acidity        free.sulfur.dioxide  pH                  
##  [7] residual.sugar       sulphates            total.sulfur.dioxide
## [10] volatile.acidity    
## 
## Root node error: 3656/5484 = 0.66667
## 
## n= 5484 
## 
##            CP nsplit rel error  xerror      xstd
## 1  0.18052516      0  1.000000 1.01614 0.0094687
## 2  0.02871991      3  0.458425 0.46827 0.0093860
## 3  0.01413202      4  0.429705 0.43682 0.0092025
## 4  0.00847921      7  0.387309 0.40673 0.0090047
## 5  0.00492341      8  0.378829 0.39278 0.0089052
## 6  0.00437637     10  0.368982 0.38074 0.0088152
## 7  0.00410284     11  0.364606 0.38020 0.0088110
## 8  0.00382932     13  0.356400 0.37062 0.0087365
## 9  0.00369256     15  0.348742 0.36761 0.0087125
## 10 0.00341904     17  0.341357 0.36296 0.0086750
## 11 0.00307713     19  0.334519 0.35175 0.0085820
## 12 0.00273523     23  0.322210 0.34655 0.0085376
## 13 0.00246171     26  0.314004 0.34272 0.0085044
## 14 0.00218818     28  0.309081 0.33835 0.0084659
## 15 0.00209701     35  0.293490 0.33835 0.0084659
## 16 0.00191466     39  0.285011 0.32221 0.0083187
## 17 0.00164114     49  0.265591 0.31865 0.0082851
## 18 0.00150438     60  0.247538 0.30525 0.0081549
## 19 0.00136761     65  0.238239 0.30060 0.0081083
## 20 0.00127644     83  0.213348 0.29458 0.0080468
## 21 0.00109409     86  0.209519 0.28446 0.0079405
## 22 0.00100292    103  0.189278 0.28474 0.0079434
## 23 0.00095733    108  0.183260 0.26422 0.0077163
## 24 0.00094213    110  0.181346 0.26422 0.0077163
## 25 0.00091174    122  0.169858 0.26422 0.0077163
## 26 0.00082057    132  0.160011 0.26395 0.0077131
## 27 0.00072939    164  0.133479 0.26094 0.0076784
## 28 0.00070334    170  0.129103 0.26012 0.0076688
## 29 0.00068381    177  0.124179 0.26012 0.0076688
## 30 0.00063822    190  0.115153 0.24015 0.0074277
## 31 0.00061543    194  0.112418 0.24125 0.0074414
## 32 0.00054705    202  0.107495 0.24043 0.0074311
## 33 0.00043764    255  0.078501 0.23359 0.0073445
## 34 0.00041028    263  0.074398 0.23468 0.0073585
## 35 0.00036470    302  0.056346 0.23441 0.0073550
## 36 0.00032823    305  0.055252 0.23441 0.0073550
## 37 0.00027352    316  0.051422 0.23085 0.0073093
## 38 0.00018235    350  0.042123 0.22839 0.0072773
## 39 0.00013676    353  0.041575 0.22921 0.0072880
## 40 0.00000000    357  0.041028 0.23085 0.0073093
# Row of the cp table with the lowest cross-validated error; use the column
# name "xerror" (column 4) for clarity, matching the pruning step below.
which.min(tree1$cptable[, "xerror"])
## 38 
## 38
# Take the cp associated with the minimum cross-validated error (the "xerror"
# column of the cp table) to prune the tree:
set.seed(1)
pruned_tree1 <- prune(tree1, cp = tree1$cptable[which.min(tree1$cptable[, "xerror"]), "CP"])

# If we want to look at the most important variables:
pruned_tree1$variable.importance
## total.sulfur.dioxide              alcohol  free.sulfur.dioxide 
##            1294.0463             697.9393             648.7481 
##            chlorides     volatile.acidity       residual.sugar 
##             536.7401             512.5792             483.5053 
##                   pH            sulphates          citric.acid 
##             437.1729             431.2653             424.1669 
##        fixed.acidity 
##             377.5757
# Predict the outcome on the test set: hard class labels for the confusion
# matrix, and class probabilities (reused later in the ensemble section).
pruned_tree1_pred_test <- predict(pruned_tree1, test_df, type="class")
pruned_tree1_prob_test <- predict(pruned_tree1, test_df, type="prob")
pruned_tree1_cm <- confusionMatrix(pruned_tree1_pred_test, test_df$binned_quality)
pruned_tree1_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          124          114   0
##   Intermediate   83          568  24
##   Low             5           48  12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7198          
##                  95% CI : (0.6905, 0.7478)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 0.973312        
##                                           
##                   Kappa : 0.3479          
##                                           
##  Mcnemar's Test P-Value : 0.000466        
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.5849              0.7781    0.33333
## Specificity               0.8512              0.5685    0.94374
## Pos Pred Value            0.5210              0.8415    0.18462
## Neg Pred Value            0.8811              0.4653    0.97371
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1268              0.5808    0.01227
## Detection Prevalence      0.2434              0.6902    0.06646
## Balanced Accuracy         0.7180              0.6733    0.63854
# overall[2] is Cohen's kappa from the caret confusion matrix.
kappa_pruned_tree1 = pruned_tree1_cm$overall[2]
kappa_pruned_tree1
##     Kappa 
## 0.3479016
# byClass rows are 1 = High, 2 = Intermediate, 3 = Low; column 1 = sensitivity.
low_sensitivity = pruned_tree1_cm$byClass[3,1]
intermediate_sensitivity = pruned_tree1_cm$byClass[2,1]
high_sensitivity = pruned_tree1_cm$byClass[1,1]
# Macro-averaged recall over the three classes. Note: this differs from
# caret's per-class "Balanced Accuracy" (mean of sensitivity and specificity).
bal_acc_pruned_tree1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_pruned_tree1
## [1] 0.5654404

Boosted tree

Let’s proceed to the Boosted Tree:

set.seed(1)
# Boosted tree (adabag::boosting) on the oversampled training set; the
# control list is forwarded to the rpart base learners (5 cross-validations).
boost1 <- boosting(binned_quality ~ ., data=os_train_df, control = rpart.control(xval = 5))
# boosting's predict returns a list; $class holds the labels, $prob the scores.
boost1_pred_test <- predict(boost1, test_df, type="class")
boost1_prob_test <- predict(boost1, test_df, type="prob")
boost1_cm <- confusionMatrix(as.factor(boost1_pred_test$class), test_df$binned_quality)
boost1_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          158          195   3
##   Intermediate   51          495  18
##   Low             3           40  15
## 
## Overall Statistics
##                                           
##                Accuracy : 0.683           
##                  95% CI : (0.6528, 0.7121)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3511          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.7453              0.6781    0.41667
## Specificity               0.7415              0.7218    0.95435
## Pos Pred Value            0.4438              0.8777    0.25862
## Neg Pred Value            0.9132              0.4324    0.97717
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1616              0.5061    0.01534
## Detection Prevalence      0.3640              0.5767    0.05930
## Balanced Accuracy         0.7434              0.6999    0.68551
# overall[2] is Cohen's kappa.
kappa_boost1 = boost1_cm$overall[2]
kappa_boost1
##     Kappa 
## 0.3510758
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = sensitivity.
low_sensitivity = boost1_cm$byClass[3,1]
intermediate_sensitivity = boost1_cm$byClass[2,1]
high_sensitivity = boost1_cm$byClass[1,1]
# Macro-averaged recall over the three classes.
bal_acc_boost1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_boost1
## [1] 0.613344

Bagged tree

Let’s proceed to the Bagged Tree:

set.seed(1)
# Bagged tree (adabag::bagging) on the oversampled training set, same
# rpart control as the boosted model for comparability.
bag1 <- bagging(binned_quality ~ ., data=os_train_df, control = rpart.control(xval = 5))
# bagging's predict returns a list; $class holds the labels, $prob the scores.
bag1_pred_test <- predict(bag1, test_df, type="class")
bag1_prob_test <- predict(bag1, test_df, type="prob")
bag1_cm <- confusionMatrix(as.factor(bag1_pred_test$class), test_df$binned_quality)
bag1_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          170          228   5
##   Intermediate   40          456  18
##   Low             2           46  13
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6534          
##                  95% CI : (0.6226, 0.6832)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3284          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.8019              0.6247    0.36111
## Specificity               0.6958              0.7661    0.94904
## Pos Pred Value            0.4218              0.8872    0.21311
## Neg Pred Value            0.9270              0.4095    0.97492
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1738              0.4663    0.01329
## Detection Prevalence      0.4121              0.5256    0.06237
## Balanced Accuracy         0.7489              0.6954    0.65508
# overall[2] is Cohen's kappa.
kappa_bag1 = bag1_cm$overall[2]
kappa_bag1
##    Kappa 
## 0.328362
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = sensitivity.
low_sensitivity = bag1_cm$byClass[3,1]
intermediate_sensitivity = bag1_cm$byClass[2,1]
high_sensitivity = bag1_cm$byClass[1,1]
# Macro-averaged recall over the three classes.
bal_acc_bag1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_bag1
## [1] 0.5958851

Random Forest

Let’s proceed to the Random Forest:

set.seed(1)
# NOTE(review): randomForest() is not an rpart wrapper and has no 'control'
# argument — rpart.control(xval = 5) is most likely absorbed by '...' and
# ignored here; confirm and consider removing it.
# os_train_df[-11] presumably drops the binned_quality outcome column — confirm.
rf1 <- randomForest(os_train_df[-11], os_train_df$binned_quality, control = rpart.control(xval = 5))
rf1_pred_test <- predict(rf1, test_df, type="class")
rf1_prob_test <- predict(rf1, test_df, type="prob")
rf1_cm <- confusionMatrix(rf1_pred_test, test_df$binned_quality)
rf1_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          138           79   0
##   Intermediate   73          639  26
##   Low             1           12  10
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8047          
##                  95% CI : (0.7784, 0.8291)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 1.001e-05       
##                                           
##                   Kappa : 0.4964          
##                                           
##  Mcnemar's Test P-Value : 0.09391         
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.6509              0.8753    0.27778
## Specificity               0.8969              0.6008    0.98620
## Pos Pred Value            0.6359              0.8659    0.43478
## Neg Pred Value            0.9028              0.6208    0.97277
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1411              0.6534    0.01022
## Detection Prevalence      0.2219              0.7546    0.02352
## Balanced Accuracy         0.7739              0.7381    0.63199
# overall[2] is Cohen's kappa.
kappa_rf1 = rf1_cm$overall[2]
kappa_rf1
##     Kappa 
## 0.4963819
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = sensitivity.
low_sensitivity = rf1_cm$byClass[3,1]
intermediate_sensitivity = rf1_cm$byClass[2,1]
high_sensitivity = rf1_cm$byClass[1,1]
# Macro-averaged recall over the three classes.
bal_acc_rf1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_rf1
## [1] 0.6013545

Neural Nets

As stated earlier in the Data Exploratory Analysis, we will use the neuralnet package. To do so, we will use the oversampled and normalized training set that contains dummies for the outcome variable.

In order to choose the best number of nodes, we will use a loop (2 to 10 nodes) and look at the performance of the model on the validation set (based on the kappa value), using a training threshold of 0.5 as the neuralnet stopping criterion.

Determination of the best Neural Net with a threshold of 0.5

# Grid search over hidden-layer sizes (2-10 nodes); for each size we record
# kappa and the macro-averaged sensitivity on the validation set.
nn_nodes_1 <- data.frame("number_nodes" = seq(from = 2, to = 10), "kappa_value" = NA, "balanced_accuracy" = NA)

for (i in 2:10) {
  # Reset the RNG so every candidate network starts from comparable weights.
  set.seed(1)
  nn <- neuralnet(high_quality + intermediate_quality + low_quality ~ ., data = os_train_norm_dummy_df[, -11], hidden = i, linear.output = FALSE, threshold = 0.5)
  # Score the validation set; columns 11-14 (presumably the outcome/dummy
  # columns — confirm) are excluded from the predictors.
  valid_pred_nn <- data.frame(predict(nn, valid_norm_dummy_df[, -c(11, 12, 13, 14)]))
  names(valid_pred_nn)[1:3] <- c("High", "Intermediate", "Low")
  valid_pred_nn$Prediction <- NA

  # Assign each row the class with the highest score.
  # seq_len() is safe for zero-row inputs, unlike 1:length(...).
  for (j in seq_len(nrow(valid_pred_nn))) {
    valid_pred_nn[j, 4] <- names(which.max(valid_pred_nn[j, 1:3]))
  }

  nn.cm <- confusionMatrix(as.factor(valid_pred_nn$Prediction), valid_norm_df$binned_quality)
  nn_nodes_1[i - 1, 2] <- nn.cm[["overall"]][["Kappa"]]
  # Macro-averaged recall over the three classes.
  nn_nodes_1[i - 1, 3] <- (nn.cm$byClass[3, 1] + nn.cm$byClass[2, 1] + nn.cm$byClass[1, 1]) / 3
}

nn_nodes_1
##   number_nodes kappa_value balanced_accuracy
## 1            2  0.26520018         0.6130463
## 2            3  0.27326774         0.6208693
## 3            4  0.27504687         0.6223886
## 4            5  0.04396906         0.3871228
## 5            6  0.03760601         0.3682718
## 6            7  0.05115352         0.3827653
## 7            8  0.29838043         0.6199154
## 8            9  0.05168019         0.3990183
## 9           10  0.05484551         0.4012064

Best Neural Net

Based on the validation results above, the chosen Neural Net uses threshold = 0.5 and 4 nodes (4 nodes gives the highest balanced accuracy, with a kappa close to the best observed):

set.seed(1)
# Final network: 4 hidden nodes, training threshold 0.5.
# NOTE(review): this fit uses train_norm_dummy_df, while the node-count search
# used the oversampled os_train_norm_dummy_df — confirm the switch is intended.
nn1 <- neuralnet(high_quality + intermediate_quality + low_quality ~ ., data = train_norm_dummy_df[, -11], hidden = 4, linear.output = FALSE, threshold = 0.5)

# Score the test set; columns 11-14 (outcome/dummy columns) are excluded.
test_pred_nn1 <- data.frame(predict(nn1, test_norm_dummy_df[, -c(11, 12, 13, 14)]))
names(test_pred_nn1)[1:3] <- c("High", "Intermediate", "Low")
test_pred_nn1$Prediction <- NA

# Pick the class with the highest score for each test row.
# seq_len() is safe for zero-row inputs, unlike 1:length(...).
for (j in seq_len(nrow(test_pred_nn1))) {
  test_pred_nn1[j, 4] <- names(which.max(test_pred_nn1[j, 1:3]))
}

nn1_cm <- confusionMatrix(as.factor(test_pred_nn1$Prediction), test_norm_df$binned_quality)
nn1_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High           70           44   2
##   Intermediate  142          682  30
##   Low             0            4   4
## 
## Overall Statistics
##                                           
##                Accuracy : 0.773           
##                  95% CI : (0.7454, 0.7989)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 0.02935         
##                                           
##                   Kappa : 0.2955          
##                                           
##  Mcnemar's Test P-Value : 7.533e-16       
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity              0.33019              0.9342    0.11111
## Specificity              0.93995              0.3065    0.99575
## Pos Pred Value           0.60345              0.7986    0.50000
## Neg Pred Value           0.83527              0.6129    0.96701
## Prevalence               0.21677              0.7464    0.03681
## Detection Rate           0.07157              0.6973    0.00409
## Detection Prevalence     0.11861              0.8732    0.00818
## Balanced Accuracy        0.63507              0.6203    0.55343
# overall[2] is Cohen's kappa.
kappa_nn1 = nn1_cm$overall[2]
kappa_nn1
##     Kappa 
## 0.2954988
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = sensitivity.
low_sensitivity = nn1_cm$byClass[3,1]
intermediate_sensitivity = nn1_cm$byClass[2,1]
high_sensitivity = nn1_cm$byClass[1,1]
# Macro-averaged recall over the three classes.
bal_acc_nn1 = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_nn1
## [1] 0.4585155

Comparing results

# Collect kappa and macro-averaged sensitivity for the eight single models
# computed above, one row per method.
results_df = data.frame("method" = c("knn", "logistic regression", "naive Bayes", "classification tree", "boosted tree", "bagged tree", "random forest", "neural net"),
                        "kappa" = c(kappa_best_knn, kappa_logist, kappa_bayes, kappa_pruned_tree1, kappa_boost1, kappa_bag1, kappa_rf1, kappa_nn1),
                        "balanced_accuracy" = c(bal_acc_best_knn, bal_acc_logist, bal_acc_bayes, bal_acc_pruned_tree1, bal_acc_boost1, bal_acc_bag1, bal_acc_rf1, bal_acc_nn1))
results_df
##                method     kappa balanced_accuracy
## 1                 knn 0.3748935         0.6253511
## 2 logistic regression 0.2591899         0.6549218
## 3         naive Bayes 0.3035937         0.6299321
## 4 classification tree 0.3479016         0.5654404
## 5        boosted tree 0.3510758         0.6133440
## 6         bagged tree 0.3283620         0.5958851
## 7       random forest 0.4963819         0.6013545
## 8          neural net 0.2954988         0.4585155
# Bar charts comparing the eight methods, sorted by each metric.
# geom_col() is the idiomatic replacement for geom_bar(stat = "identity").
all_kappa_plot <- ggplot(data = results_df, aes(x = reorder(method, kappa), y = kappa)) +
  geom_col() +
  labs(title = "Kappa for each method", x = "Method", y = "Kappa") +
  theme(axis.text.x = element_text(angle = 90))

all_bal_acc_plot <- ggplot(data = results_df, aes(x = reorder(method, balanced_accuracy), y = balanced_accuracy)) +
  geom_col() +
  labs(title = "Balanced accuracy for each method", x = "Method", y = "Balanced accuracy") +
  theme(axis.text.x = element_text(angle = 90))

# Side-by-side layout via cowplot.
results_plot1 <- plot_grid(all_kappa_plot, all_bal_acc_plot)
results_plot1

Ensembles

# Per-model probability of class "Low" for every test row.
ensemble_prob_low_df = data.frame("knn_prob_low" = attr(best_knn_pred, "prob")[,3],
                     "log_prob_low" = os_log_prob[,3],
                     "nb_prob_low" = os_nb_prob[,3],
                     "tree_prob_low" = pruned_tree1_prob_test[,3],
                     "boosting_prob_low" = boost1_prob_test$prob[,3],
                     "bagging_prob_low" = bag1_prob_test$prob[,3],
                     "rf_prob_low" = rf1_prob_test[,3],
                     "nnet_prob_low" = test_pred_nn1[,3])
# Average only columns 1, 2, 5, 7 = knn, logistic, boosting, random forest —
# presumably the chosen subset for the mean-probability ensemble; the other
# models are kept for reference only. TODO confirm this selection.
ensemble_prob_low_df$mean_prob_low = apply(ensemble_prob_low_df[,c(1,2,5,7)], 1, mean)
head(ensemble_prob_low_df)
##    knn_prob_low log_prob_low nb_prob_low tree_prob_low boosting_prob_low
## 2             0   0.77309686  0.13981175      0.000000        0.29894777
## 8             0   0.21146901  0.00792725      0.000000        0.13121962
## 10            0   0.22419847  0.02801932      0.000000        0.04075472
## 16            0   0.04869837  0.00276073      0.000000        0.00000000
## 21            0   0.42845514  0.14712599      0.000000        0.05555899
## 24            1   0.90074622  0.81086084      0.952381        0.64561551
##    bagging_prob_low rf_prob_low nnet_prob_low mean_prob_low
## 2              0.55       0.100   0.088932689    0.29301116
## 8              0.00       0.014   0.004140789    0.08917216
## 10             0.00       0.056   0.020970957    0.08023830
## 16             0.00       0.000   0.011903364    0.01217459
## 21             0.00       0.002   0.057560142    0.12150353
## 24             0.44       0.596   0.579583604    0.78559043
# Per-model probability of class "Intermediate" for every test row.
ensemble_prob_inter_df = data.frame("knn_prob_inter" = attr(best_knn_pred, "prob")[,2],
                     "log_prob_inter" = os_log_prob[,2],
                     "nb_prob_inter" = os_nb_prob[,2],
                     "tree_prob_inter" = pruned_tree1_prob_test[,2],
                     "boosting_prob_inter" = boost1_prob_test$prob[,2],
                     "bagging_prob_inter" = bag1_prob_test$prob[,2],
                     "rf_prob_inter" = rf1_prob_test[,2],
                     "nnet_prob_inter" = test_pred_nn1[,2])
# Same model subset as for "Low": columns 1, 2, 5, 7 = knn, logistic,
# boosting, random forest.
ensemble_prob_inter_df$mean_prob_inter = apply(ensemble_prob_inter_df[,c(1,2,5,7)], 1, mean)
head(ensemble_prob_inter_df)
##    knn_prob_inter log_prob_inter nb_prob_inter tree_prob_inter
## 2               1     0.20335112    0.68476606      1.00000000
## 8               1     0.66187915    0.92371260      1.00000000
## 10              1     0.40644241    0.55929249      1.00000000
## 16              0     0.25746639    0.11256673      0.86666667
## 21              0     0.29900199    0.05309515      0.00000000
## 24              0     0.09574893    0.18696631      0.04761905
##    boosting_prob_inter bagging_prob_inter rf_prob_inter nnet_prob_inter
## 2            0.6878942               0.45         0.878       0.9445283
## 8            0.6524447               1.00         0.808       0.8849977
## 10           0.4689815               0.00         0.594       0.7708970
## 16           0.3737503               0.00         0.430       0.4913243
## 21           0.2888439               0.00         0.072       0.5156588
## 24           0.3470573               0.56         0.396       0.6006858
##    mean_prob_inter
## 2        0.6923113
## 8        0.7805810
## 10       0.6173560
## 16       0.2653042
## 21       0.1649615
## 24       0.2097016
# Per-model probability of class "High" for every test row.
ensemble_prob_high_df = data.frame("knn_prob_high" = attr(best_knn_pred, "prob")[,1],
                     "log_prob_high" = os_log_prob[,1],
                     "nb_prob_high" = os_nb_prob[,1],
                     "tree_prob_high" = pruned_tree1_prob_test[,1],
                     "boosting_prob_high" = boost1_prob_test$prob[,1],
                     "bagging_prob_high" = bag1_prob_test$prob[,1],
                     "rf_prob_high" = rf1_prob_test[,1],
                     "nnet_prob_high" = test_pred_nn1[,1])
# Same model subset as above: columns 1, 2, 5, 7 = knn, logistic, boosting,
# random forest.
ensemble_prob_high_df$mean_prob_high = apply(ensemble_prob_high_df[,c(1,2,5,7)], 1, mean)
head(ensemble_prob_high_df)
##    knn_prob_high log_prob_high nb_prob_high tree_prob_high boosting_prob_high
## 2              0   0.023552014  0.175422185      0.0000000        0.013158039
## 8              0   0.126651841  0.068360150      0.0000000        0.216335678
## 10             0   0.369359126  0.412688190      0.0000000        0.490263760
## 16             1   0.693835240  0.884672538      0.1333333        0.626249668
## 21             1   0.272542870  0.799778864      1.0000000        0.655597113
## 24             0   0.003504844  0.002172852      0.0000000        0.007327199
##    bagging_prob_high rf_prob_high nnet_prob_high mean_prob_high
## 2                  0        0.022     0.02481099    0.014677513
## 8                  0        0.178     0.10927765    0.130246880
## 10                 1        0.350     0.17698168    0.302405721
## 16                 1        0.570     0.49258641    0.722521227
## 21                 1        0.926     0.39455833    0.713534996
## 24                 0        0.008     0.05135879    0.004708011
# Combine the three averaged class probabilities; the mean-probability
# ensemble predicts, for each row, the class whose column has the maximum.
ensemble_prob_df = data.frame("Low" = ensemble_prob_low_df$mean_prob_low,
                              "Intermediate" = ensemble_prob_inter_df$mean_prob_inter,
                              "High" = ensemble_prob_high_df$mean_prob_high)
ensemble_prob_df$mean_prob_pred = as.factor(colnames(ensemble_prob_df)[apply(ensemble_prob_df,1,which.max)])
head(ensemble_prob_df)
##          Low Intermediate        High mean_prob_pred
## 1 0.29301116    0.6923113 0.014677513   Intermediate
## 2 0.08917216    0.7805810 0.130246880   Intermediate
## 3 0.08023830    0.6173560 0.302405721   Intermediate
## 4 0.01217459    0.2653042 0.722521227           High
## 5 0.12150353    0.1649615 0.713534996           High
## 6 0.78559043    0.2097016 0.004708011            Low
# Gather every model's class prediction next to the actual test value.
ensembles_pred_df = data.frame("actual_value" = test_df$binned_quality,
                              "knn_pred" = best_knn_pred,
                              "log_pred" = os_log_pred$pred,
                              "nb_pred" = os_nb_pred,
                              "tree_pred" = pruned_tree1_pred_test,
                              "boosting_pred" = as.factor(boost1_pred_test$class),
                              "bagging_pred" = as.factor(bag1_pred_test$class),
                              "rf_pred" = rf1_pred_test, 
                              "nnet_pred" = as.factor(test_pred_nn1$Prediction))
# Majority vote over columns 2, 3, 4, 6, 8 = knn, logistic, naive Bayes,
# boosting, neural net (an odd number of voters limits — but does not
# eliminate — ties; which.max(table(x)) breaks ties by taking the first level).
ensembles_pred_df$majority_vote_pred = as.factor(apply(ensembles_pred_df[,c(2,3,4,6,8)], 1, function(x) names(which.max(table(x)))))
ensembles_pred_df$mean_prob_pred = ensemble_prob_df$mean_prob_pred
head(ensembles_pred_df)
##    actual_value     knn_pred     log_pred      nb_pred    tree_pred
## 2  Intermediate Intermediate          Low Intermediate Intermediate
## 8  Intermediate Intermediate Intermediate Intermediate Intermediate
## 10 Intermediate Intermediate Intermediate Intermediate Intermediate
## 16         High         High         High         High Intermediate
## 21         High         High          Low         High         High
## 24 Intermediate          Low          Low          Low          Low
##    boosting_pred bagging_pred      rf_pred    nnet_pred majority_vote_pred
## 2   Intermediate          Low Intermediate Intermediate       Intermediate
## 8   Intermediate Intermediate Intermediate Intermediate       Intermediate
## 10          High         High Intermediate Intermediate       Intermediate
## 16          High         High         High         High               High
## 21          High         High         High Intermediate               High
## 24           Low Intermediate          Low Intermediate                Low
##    mean_prob_pred
## 2    Intermediate
## 8    Intermediate
## 10   Intermediate
## 16           High
## 21           High
## 24            Low
# Evaluate the majority-vote ensemble; test_df[,11] is assumed to be the
# binned_quality outcome used as test_df$binned_quality earlier — confirm.
majority_vote_cm <- confusionMatrix(ensembles_pred_df$majority_vote_pred, test_df[,11])
majority_vote_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          170          166   2
##   Intermediate   39          520  16
##   Low             3           44  18
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7239          
##                  95% CI : (0.6948, 0.7517)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 0.9499          
##                                           
##                   Kappa : 0.4294          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.8019              0.7123    0.50000
## Specificity               0.7807              0.7782    0.95011
## Pos Pred Value            0.5030              0.9043    0.27692
## Neg Pred Value            0.9344              0.4789    0.98028
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1738              0.5317    0.01840
## Detection Prevalence      0.3456              0.5879    0.06646
## Balanced Accuracy         0.7913              0.7453    0.72505
# overall[2] is Cohen's kappa.
kappa_majority = majority_vote_cm$overall[2]
kappa_majority
##     Kappa 
## 0.4293531
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = sensitivity.
low_sensitivity = majority_vote_cm$byClass[3,1]
intermediate_sensitivity = majority_vote_cm$byClass[2,1]
high_sensitivity = majority_vote_cm$byClass[1,1]

# Macro-averaged recall over the three classes.
bal_acc_majority = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_majority
## [1] 0.6714052
# Evaluate the mean-probability ensemble against the same outcome column.
mean_prob_cm <- confusionMatrix(ensembles_pred_df$mean_prob_pred, test_df[,11])
mean_prob_cm
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     High Intermediate Low
##   High          153          116   2
##   Intermediate   55          561  13
##   Low             4           53  21
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7515          
##                  95% CI : (0.7232, 0.7783)
##     No Information Rate : 0.7464          
##     P-Value [Acc > NIR] : 0.3725          
##                                           
##                   Kappa : 0.4562          
##                                           
##  Mcnemar's Test P-Value : 4.087e-10       
## 
## Statistics by Class:
## 
##                      Class: High Class: Intermediate Class: Low
## Sensitivity               0.7217              0.7685    0.58333
## Specificity               0.8460              0.7258    0.93949
## Pos Pred Value            0.5646              0.8919    0.26923
## Neg Pred Value            0.9165              0.5158    0.98333
## Prevalence                0.2168              0.7464    0.03681
## Detection Rate            0.1564              0.5736    0.02147
## Detection Prevalence      0.2771              0.6431    0.07975
## Balanced Accuracy         0.7838              0.7471    0.76141
# overall[2] is Cohen's kappa.
kappa_mean_prob = mean_prob_cm$overall[2]
kappa_mean_prob
##     Kappa 
## 0.4562365
# byClass rows: 1 = High, 2 = Intermediate, 3 = Low; column 1 = sensitivity.
low_sensitivity = mean_prob_cm$byClass[3,1]
intermediate_sensitivity = mean_prob_cm$byClass[2,1]
high_sensitivity = mean_prob_cm$byClass[1,1]

# Macro-averaged recall over the three classes.
bal_acc_mean_prob = (low_sensitivity + intermediate_sensitivity + high_sensitivity) / 3
bal_acc_mean_prob
## [1] 0.6911749

Comparing results (including Ensembles)

# Append the two ensemble methods to the comparison table, then redraw
# the metric charts. geom_col() is the idiomatic form of
# geom_bar(stat = "identity").
results_df[9, ] <- list("majority vote", kappa_majority, bal_acc_majority)
results_df[10, ] <- list("mean probability", kappa_mean_prob, bal_acc_mean_prob)
all_kappa_plot2 <- ggplot(data = results_df, aes(x = reorder(method, kappa), y = kappa)) +
  geom_col() +
  labs(title = "Kappa for each method", x = "Method", y = "Kappa") +
  theme(axis.text.x = element_text(angle = 90))

all_bal_acc_plot2 <- ggplot(data = results_df, aes(x = reorder(method, balanced_accuracy), y = balanced_accuracy)) +
  geom_col() +
  labs(title = "Balanced accuracy for each method", x = "Method", y = "Balanced accuracy") +
  theme(axis.text.x = element_text(angle = 90))

# Side-by-side layout via cowplot.
results_plot2 <- plot_grid(all_kappa_plot2, all_bal_acc_plot2)
results_plot2

Visualising Confusion Matrices for each method

We first create the data-frames associated to the confusion matrices of each method:

# All ten confusion matrices get the exact same reshaping, so do it once.
# prep_cm_df() turns a caret confusionMatrix into a data frame with:
#  - Prediction releveled in reverse order (so the plotted y axis reads
#    in the original level order, top to bottom),
#  - pred = "correct" / "error" flag used for tile colouring,
#  - prop = Freq as a proportion within each Reference class (tile alpha).
prep_cm_df <- function(cm) {
  df <- as.data.frame(cm$table)
  df$Prediction <- factor(df$Prediction, levels = rev(levels(df$Prediction)))
  df %>%
    mutate(pred = ifelse(Prediction == Reference, "correct", "error")) %>%
    group_by(Reference) %>%
    mutate(prop = Freq / sum(Freq))
}

plot_knn_cm <- prep_cm_df(best_knn_cm)
plot_os_log_cm <- prep_cm_df(os_log_cm)
plot_bayes_cm <- prep_cm_df(bayes_cm)
plot_pruned_tree1_cm <- prep_cm_df(pruned_tree1_cm)
plot_boost1_cm <- prep_cm_df(boost1_cm)
plot_bag1_cm <- prep_cm_df(bag1_cm)
plot_rf1_cm <- prep_cm_df(rf1_cm)
plot_nn1_cm <- prep_cm_df(nn1_cm)
plot_majority_vote_cm <- prep_cm_df(majority_vote_cm)
plot_mean_prob_cm <- prep_cm_df(mean_prob_cm)

Let’s create the plots:

plot_knn_cm <- ggplot(plot_knn_cm, aes(Reference, Prediction, fill = pred, alpha = prop)) +
        geom_tile() + 
        theme_bw() + 
        geom_text(aes(label=Freq), vjust = .5, fontface  = "bold", alpha = 1) +
        scale_fill_manual(values = c(correct = "#5DADE2", error = "#EC7063")) +
        xlim((levels(plot_knn_cm$Reference))) +
        labs(x = "Reference",y = "Prediction", title = "Confusion Matrix: kNN")
plot_knn_cm

plot_os_log_cm <- ggplot(plot_os_log_cm, aes(Reference, Prediction, fill = pred, alpha = prop)) +
    geom_tile() + 
    theme_bw() + 
    geom_text(aes(label=Freq), vjust = .5, fontface  = "bold", alpha = 1) +
    scale_fill_manual(values = c(correct = "#5DADE2", error = "#EC7063")) +
    xlim((levels(plot_os_log_cm$Reference))) +
    labs(x = "Reference",y = "Prediction", title = "Confusion Matrix: Logistic Regression")
plot_os_log_cm

plot_bayes_cm <- ggplot(plot_bayes_cm, aes(Reference, Prediction, fill = pred, alpha = prop)) +
    geom_tile() + 
    theme_bw() + 
    geom_text(aes(label=Freq), vjust = .5, fontface  = "bold", alpha = 1) +
    scale_fill_manual(values = c(correct = "#5DADE2", error = "#EC7063")) +
    xlim((levels(plot_bayes_cm$Reference))) +
    labs(x = "Reference",y = "Prediction", title = "Confusion Matrix: Naive Bayes")
plot_bayes_cm

# Pruned tree confusion matrix heatmap: blue tiles = correct, red = errors;
# opacity encodes the within-reference-class proportion, labels show raw counts.
plot_pruned_tree1_cm <- ggplot(plot_pruned_tree1_cm,
                               aes(Reference, Prediction, fill = pred, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), alpha = 1, fontface = "bold", vjust = 0.5) +
  scale_fill_manual(values = c(error = "#EC7063", correct = "#5DADE2")) +
  xlim(levels(plot_pruned_tree1_cm$Reference)) +
  labs(title = "Confusion Matrix: Pruned Tree",
       x = "Reference", y = "Prediction") +
  theme_bw()
plot_pruned_tree1_cm

# Boosted tree confusion matrix heatmap: blue tiles = correct, red = errors;
# opacity encodes the within-reference-class proportion, labels show raw counts.
plot_boost1_cm <- ggplot(plot_boost1_cm,
                         aes(Reference, Prediction, fill = pred, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), alpha = 1, fontface = "bold", vjust = 0.5) +
  scale_fill_manual(values = c(error = "#EC7063", correct = "#5DADE2")) +
  xlim(levels(plot_boost1_cm$Reference)) +
  labs(title = "Confusion Matrix: Boosted Tree",
       x = "Reference", y = "Prediction") +
  theme_bw()
plot_boost1_cm

# Bagged tree confusion matrix heatmap: blue tiles = correct, red = errors;
# opacity encodes the within-reference-class proportion, labels show raw counts.
plot_bag1_cm <- ggplot(plot_bag1_cm,
                       aes(Reference, Prediction, fill = pred, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), alpha = 1, fontface = "bold", vjust = 0.5) +
  scale_fill_manual(values = c(error = "#EC7063", correct = "#5DADE2")) +
  xlim(levels(plot_bag1_cm$Reference)) +
  labs(title = "Confusion Matrix: Bagged Tree",
       x = "Reference", y = "Prediction") +
  theme_bw()
plot_bag1_cm

# Random forest confusion matrix heatmap: blue tiles = correct, red = errors;
# opacity encodes the within-reference-class proportion, labels show raw counts.
plot_rf1_cm <- ggplot(plot_rf1_cm,
                      aes(Reference, Prediction, fill = pred, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), alpha = 1, fontface = "bold", vjust = 0.5) +
  scale_fill_manual(values = c(error = "#EC7063", correct = "#5DADE2")) +
  xlim(levels(plot_rf1_cm$Reference)) +
  labs(title = "Confusion Matrix: Random Forest",
       x = "Reference", y = "Prediction") +
  theme_bw()
plot_rf1_cm

# Neural net confusion matrix heatmap: blue tiles = correct, red = errors;
# opacity encodes the within-reference-class proportion, labels show raw counts.
plot_nn1_cm <- ggplot(plot_nn1_cm,
                      aes(Reference, Prediction, fill = pred, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), alpha = 1, fontface = "bold", vjust = 0.5) +
  scale_fill_manual(values = c(error = "#EC7063", correct = "#5DADE2")) +
  xlim(levels(plot_nn1_cm$Reference)) +
  labs(title = "Confusion Matrix: Neural Net",
       x = "Reference", y = "Prediction") +
  theme_bw()
plot_nn1_cm

# Majority-vote ensemble confusion matrix heatmap: blue = correct, red = errors;
# opacity encodes the within-reference-class proportion, labels show raw counts.
plot_majority_vote_cm <- ggplot(plot_majority_vote_cm,
                                aes(Reference, Prediction, fill = pred, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), alpha = 1, fontface = "bold", vjust = 0.5) +
  scale_fill_manual(values = c(error = "#EC7063", correct = "#5DADE2")) +
  xlim(levels(plot_majority_vote_cm$Reference)) +
  labs(title = "Confusion Matrix: Majority Vote (Ensembles)",
       x = "Reference", y = "Prediction") +
  theme_bw()
plot_majority_vote_cm

# Mean-probability ensemble confusion matrix heatmap: blue = correct, red = errors;
# opacity encodes the within-reference-class proportion, labels show raw counts.
plot_mean_prob_cm <- ggplot(plot_mean_prob_cm,
                            aes(Reference, Prediction, fill = pred, alpha = prop)) +
  geom_tile() +
  geom_text(aes(label = Freq), alpha = 1, fontface = "bold", vjust = 0.5) +
  scale_fill_manual(values = c(error = "#EC7063", correct = "#5DADE2")) +
  xlim(levels(plot_mean_prob_cm$Reference)) +
  labs(title = "Confusion Matrix: Mean Probability (Ensembles)",
       x = "Reference", y = "Prediction") +
  theme_bw()
plot_mean_prob_cm